###Major League Baseball Data

# Read the 2014 MLB statistics file and echo the raw table.
MLB <- read.csv(file = "Major League Baseball Main Stats Altered 2014.csv")
MLB

# Keep only columns 2 through 6 (selected by position) and echo the result.
# These are the five variables analysed below; column 1 is dropped
# (presumably a team identifier -- confirm against the CSV).
MLB.keep <- 2:6
MLB <- MLB[, MLB.keep]
MLB

library("MVA")


# Perform PCA on the covariance matrix of the MLB variables.
# eigen() returns the eigenvalues in decreasing order; each eigenvalue is the
# variance of one principal component, and the matching column of $vectors
# holds that component's loadings on the original variables.

cov.MLB <- cov(MLB)
cov.MLB
MLB_PCA <- eigen(cov.MLB)

# Eigenvalues and eigenvector columns belong to principal components, not to
# the original variables, so label them PC1, PC2, ... .  Labelling them with
# names(MLB) would make the output read as if the first variable "owned" the
# largest eigenvalue.
MLB_PC.labels <- paste0("PC", seq_along(MLB_PCA$values))
names(MLB_PCA$values) <- MLB_PC.labels
colnames(MLB_PCA$vectors) <- MLB_PC.labels

# Rows of the eigenvector matrix correspond to the original variables.
rownames(MLB_PCA$vectors) <- names(MLB)



# Scree plot for the covariance-matrix PCA.
# The eigenvalues are the variances of the principal components.
MLB_PC.variance <- MLB_PCA$values

# Proportion of the total variance carried by each component (echoed).
MLB_PC.prop <- MLB_PC.variance / sum(MLB_PC.variance)
MLB_PC.prop


plot(
  x = seq_along(MLB_PC.variance),
  y = MLB_PC.variance,
  type = "b",
  main = "Scree Plot for Major League Baseball - all variables, covariance matrix",
  xlab = "Principal Component Number",
  ylab = "Principal Component Variance"
)

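#The scree plot shows that the PCA on the covariance matrix detects one principal component.
#The first principal component has by far the biggest proportion of the total variation.
#(This proportion belongs to the component, not to the Mean Salary variable itself;
#inspect the loadings to see how strongly Mean Salary contributes to it.)
#The rest of the PCs are beyond the elbow.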



# Perform PCA on the correlation matrix of the MLB variables, which puts
# every variable on the same (unit-variance) scale before decomposition.
cor.MLB <- cor(MLB)
cor.MLB
MLB_PCA1 <- eigen(cor.MLB)

# Eigenvalues and eigenvector columns belong to principal components, not to
# the original variables, so label them PC1, PC2, ... .  Labelling them with
# names(MLB) would make the output read as if the first variable "owned" the
# largest eigenvalue.
MLB_PC1.labels <- paste0("PC", seq_along(MLB_PCA1$values))
names(MLB_PCA1$values) <- MLB_PC1.labels
colnames(MLB_PCA1$vectors) <- MLB_PC1.labels

# Rows of the eigenvector matrix correspond to the original variables.
rownames(MLB_PCA1$vectors) <- names(MLB)



# Scree plot for the correlation-matrix PCA.
# The eigenvalues are the variances of the principal components.
MLB_PC1.variance <- MLB_PCA1$values

# Proportion of the total variance carried by each component (echoed).
MLB_PC1.prop <- MLB_PC1.variance / sum(MLB_PC1.variance)
MLB_PC1.prop

plot(
  x = seq_along(MLB_PC1.variance),
  y = MLB_PC1.variance,
  type = "b",
  main = "Scree Plot for Major League Baseball - all variables, correlation matrix",
  xlab = "Principal Component Number",
  ylab = "Principal Component Variance"
)


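#The scree plot shows that the PCA on the correlation matrix also has one dominant principal component.
#As before, the first principal component has the biggest proportion of the total variation
#(this proportion belongs to the component, not to the Mean Salary variable itself),
#but this time there are two more PCs that are not beyond the elbow.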



# Reflect Earned.Run.Avg and Errors about their maxima (max - x), which
# reverses the ordering of each variable -- presumably so that larger values
# mean better performance, like the other statistics.
MLB$Earned.Run.Avg <- max(MLB$Earned.Run.Avg) - MLB$Earned.Run.Avg
MLB$Errors <- max(MLB$Errors) - MLB$Errors

# Correlation matrix of the adjusted data.
cor.MLB <- cor(MLB)
cor.MLB

# Decompose once and reuse the result instead of calling eigen() repeatedly.
MLB_cor.eigen <- eigen(cor.MLB)
round(MLB_cor.eigen$values, 3)
round(MLB_cor.eigen$vectors, 3)


# Eigenvalues = variances of the principal components, used for the scree plot.
MLB_PC1.variance <- MLB_cor.eigen$values

plot(seq_along(MLB_PC1.variance), MLB_PC1.variance,
     main = "Scree Plot for Major League Baseball - all variables, correlation matrix",
     xlab = "Principal Component Number", ylab = "Principal Component Variance",
     type = "b")



# Eigen-decomposition of the correlation matrix with the first variable left
# out (all rows/columns except the first; with five variables this is 2:5).
eigen(cor.MLB[-1, -1])

# Decompose the full correlation matrix once and reuse the result.
MLB.eig <- eigen(cor.MLB)
PC.variance <- MLB.eig$values    # variance of each principal component
PC.variables <- MLB.eig$vectors  # loadings, one column per component
PC.var.prop <- PC.variance / sum(PC.variance)
PC.var.prop

# Lower-triangular matrix of ones, sized from the number of components rather
# than hard-coded as 5 x 5.  Row k of M %*% p sums the first k proportions,
# so the product is the cumulative proportion of variance, i.e. the same
# numbers as cumsum(PC.var.prop), returned as a one-column matrix.
n.PC <- length(PC.var.prop)
M <- lower.tri(diag(n.PC), diag = TRUE) + 0
M

M %*% PC.var.prop

plot(seq_along(PC.variance), PC.variance, main = "Scree Plot for Baseball Data",
     xlab = "Principal Component Number", ylab = "Principal Component Variance",
     type = "b")

###Make the highly paid New York Yankees have a miserable year with 
###one of the lowest winning percentages and batting averages.
###Then redo the above analyses. 
###First, read in the data again, and adjust it as before.


# Re-read the original file so that the earlier in-place changes to MLB are
# discarded, and echo the raw table.
MLB <- read.csv(file = "Major League Baseball Main Stats Altered 2014.csv")
MLB

# Keep only columns 2 through 6 (selected by position), as in the first pass,
# and echo the result.
MLB.keep <- 2:6
MLB <- MLB[, MLB.keep]
MLB

library("MVA")

###Then change the statistics for the New York Yankees in Row 19. 

# Echo the row before the change, overwrite two of its statistics, then echo
# it again to confirm the edit.  The row and column positions are hard-coded;
# verify them against the CSV if the file changes.
yankees.row <- 19
MLB[yankees.row, ]
MLB[yankees.row, 2] <- 0.377  # the winning percentage for the Chicago Cubs
MLB[yankees.row, 3] <- 0.245  # the batting average for the Toronto Blue Jays
MLB[yankees.row, ]

###Then proceed as before.

# Reflect Earned.Run.Avg and Errors about their maxima (max - x), reversing
# the ordering of each variable, exactly as in the first pass.
MLB[["Earned.Run.Avg"]] <- max(MLB[["Earned.Run.Avg"]]) - MLB[["Earned.Run.Avg"]]
MLB[["Errors"]] <- max(MLB[["Errors"]]) - MLB[["Errors"]]

# Correlation matrix of the altered data, followed by its eigenvalues and
# eigenvectors rounded to three decimals.
cor.MLB <- cor(MLB)
cor.MLB
MLB_cor.eigen <- eigen(cor.MLB)
round(MLB_cor.eigen$values, digits = 3)
round(MLB_cor.eigen$vectors, digits = 3)

# Eigen-decomposition of the correlation matrix with the first variable left
# out (all rows/columns except the first; with five variables this is 2:5).
eigen(cor.MLB[-1, -1])

# Decompose the full correlation matrix once and reuse the result.
MLB.eig <- eigen(cor.MLB)
PC.variance <- MLB.eig$values    # variance of each principal component
PC.variables <- MLB.eig$vectors  # loadings, one column per component
PC.var.prop <- PC.variance / sum(PC.variance)
PC.var.prop

# Lower-triangular matrix of ones, sized from the number of components rather
# than hard-coded as 5 x 5.  Row k of M %*% p sums the first k proportions,
# so the product is the cumulative proportion of variance, i.e. the same
# numbers as cumsum(PC.var.prop), returned as a one-column matrix.
n.PC <- length(PC.var.prop)
M <- lower.tri(diag(n.PC), diag = TRUE) + 0
M

M %*% PC.var.prop

plot(seq_along(PC.variance), PC.variance, main = "Scree Plot for Baseball Data",
     xlab = "Principal Component Number", ylab = "Principal Component Variance",
     type = "b")